import packages
read in file
we want to subset the dataset, since here are variables and values that we don’t need to include.
Values to leave out: - Data from years 2010-2011. There a lot of missing data from those years since air pollution was not a large concern then.
Variables to leave out: - PM measurements from the Jingan and Xuhui regions (PM_Jingan, PM_Xuhui). There are less reliable than PM measurements from the US consulate.
explanatory data analysis with each variable against outcome variable, PM_US.Post
par(mfrow=c(3,4))
plot(x = year, y = PM_US.Post, main="Year VS PM2.5 Concentration", xlab="Year", ylab="PM2.5 Concentration (ug/m^3)")
boxplot(PM_US.Post~month,data=df, main="Month VS PM2.5 Concentration", xlab="Month", ylab="PM2.5 Concentration (ug/m^3)")
boxplot(PM_US.Post~day,data=df, main="Day VS PM2.5 Concentration", xlab="Day", ylab="PM2.5 Concentration (ug/m^3)")
boxplot(PM_US.Post~hour,data=df, main="Hour VS PM2.5 Concentration", xlab="Hour", ylab="PM2.5 Concentration (ug/m^3)")
boxplot(PM_US.Post~as.factor(season),data=df, main="Season VS PM2.5 Concentration", xlab="Season (Spring, Summer, Fall, Winter)", ylab="PM2.5 Concentration (ug/m^3)")
plot(x = DEWP, y = PM_US.Post, main="Dew Point VS PM2.5 Concentration", xlab="Dew Point (ºC)", ylab="PM2.5 Concentration (ug/m^3)")
plot(x = HUMI, y = PM_US.Post, main="Humidity VS PM2.5 Concentration", xlab="Humidity (%)", ylab="PM2.5 Concentration (ug/m^3)")
plot(x = PRES, y = PM_US.Post, main="Pressure VS PM2.5 Concentration", xlab="Pressure (hPa)", ylab="PM2.5 Concentration (ug/m^3)")
plot(x = TEMP, y = PM_US.Post, main="Temperature VS PM2.5 Concentration", xlab="Temperature (ºC)", ylab="PM2.5 Concentration (ug/m^3)")
boxplot(PM_US.Post~cbwd,data=df, main="Combined Wind Direction VS PM2.5 Concentration", xlab="Combined Wind Direction", ylab="PM2.5 Concentration (ug/m^3)")
plot(x = Iws, y = PM_US.Post, main="Cumulated Wind Speed VS PM2.5 Concentration", xlab="Cumulated wind speed (m/s)", ylab="PM2.5 Concentration (ug/m^3)")
plot(x = precipitation, y = PM_US.Post, main="Hourly Percipitation VS PM2.5 Concentration", xlab="Hourly Precipitation (mm)", ylab="PM2.5 Concentration (ug/m^3)")
plot(x = Iprec, y = PM_US.Post, main="Cumulated Percipitation VS PM2.5 Concentration", xlab="Cumulated precipitation (mm)", ylab="PM2.5 Concentration (ug/m^3)")
# make first basic linear regression with all continous variables, no transformation
reg1= lm(PM_US.Post ~ year + as.factor(month) + as.factor(day) + as.factor(hour) + as.factor(season) +
DEWP + HUMI + PRES + TEMP + as.factor(cbwd) + Iws + precipitation + Iprec, data = df)
summary(reg1)
##
## Call:
## lm(formula = PM_US.Post ~ year + as.factor(month) + as.factor(day) +
## as.factor(hour) + as.factor(season) + DEWP + HUMI + PRES +
## TEMP + as.factor(cbwd) + Iws + precipitation + Iprec, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -88.84 -20.45 -5.98 12.63 594.96
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.651e+03 3.703e+02 12.560 < 2e-16 ***
## year -1.856e+00 1.850e-01 -10.030 < 2e-16 ***
## as.factor(month)2 -2.104e+01 9.703e-01 -21.679 < 2e-16 ***
## as.factor(month)3 -2.619e+01 9.946e-01 -26.334 < 2e-16 ***
## as.factor(month)4 -4.002e+01 1.193e+00 -33.532 < 2e-16 ***
## as.factor(month)5 -4.613e+01 1.363e+00 -33.858 < 2e-16 ***
## as.factor(month)6 -6.351e+01 1.554e+00 -40.876 < 2e-16 ***
## as.factor(month)7 -8.057e+01 1.807e+00 -44.585 < 2e-16 ***
## as.factor(month)8 -8.155e+01 1.738e+00 -46.909 < 2e-16 ***
## as.factor(month)9 -6.815e+01 1.537e+00 -44.342 < 2e-16 ***
## as.factor(month)10 -5.152e+01 1.330e+00 -38.721 < 2e-16 ***
## as.factor(month)11 -2.696e+01 1.092e+00 -24.698 < 2e-16 ***
## as.factor(month)12 2.453e-01 9.644e-01 0.254 0.799189
## as.factor(day)2 -6.691e-01 1.589e+00 -0.421 0.673781
## as.factor(day)3 -4.139e+00 1.568e+00 -2.639 0.008320 **
## as.factor(day)4 2.291e+00 1.578e+00 1.452 0.146581
## as.factor(day)5 -6.101e-01 1.576e+00 -0.387 0.698736
## as.factor(day)6 7.906e+00 1.577e+00 5.014 5.36e-07 ***
## as.factor(day)7 5.500e+00 1.573e+00 3.496 0.000473 ***
## as.factor(day)8 2.225e+00 1.585e+00 1.404 0.160267
## as.factor(day)9 3.899e-01 1.576e+00 0.247 0.804586
## as.factor(day)10 2.984e+00 1.574e+00 1.896 0.057985 .
## as.factor(day)11 3.865e+00 1.576e+00 2.453 0.014172 *
## as.factor(day)12 -4.410e-01 1.587e+00 -0.278 0.781109
## as.factor(day)13 -4.378e+00 1.588e+00 -2.757 0.005833 **
## as.factor(day)14 1.857e+00 1.574e+00 1.180 0.238123
## as.factor(day)15 1.251e+00 1.571e+00 0.796 0.425860
## as.factor(day)16 -5.062e-01 1.562e+00 -0.324 0.745901
## as.factor(day)17 -9.869e-01 1.566e+00 -0.630 0.528653
## as.factor(day)18 -2.417e+00 1.577e+00 -1.533 0.125299
## as.factor(day)19 3.033e+00 1.587e+00 1.911 0.056039 .
## as.factor(day)20 4.027e+00 1.576e+00 2.556 0.010606 *
## as.factor(day)21 -2.009e+00 1.584e+00 -1.268 0.204781
## as.factor(day)22 -1.095e+00 1.580e+00 -0.693 0.488370
## as.factor(day)23 2.701e+00 1.597e+00 1.691 0.090928 .
## as.factor(day)24 1.812e+00 1.591e+00 1.139 0.254722
## as.factor(day)25 -1.252e+00 1.575e+00 -0.795 0.426560
## as.factor(day)26 2.061e+00 1.577e+00 1.307 0.191347
## as.factor(day)27 -4.084e+00 1.584e+00 -2.578 0.009929 **
## as.factor(day)28 -1.747e+00 1.574e+00 -1.109 0.267232
## as.factor(day)29 7.766e-02 1.602e+00 0.048 0.961329
## as.factor(day)30 3.247e+00 1.624e+00 2.000 0.045530 *
## as.factor(day)31 -3.892e+00 1.830e+00 -2.127 0.033449 *
## as.factor(hour)1 -2.010e-01 1.404e+00 -0.143 0.886149
## as.factor(hour)2 -1.247e+00 1.407e+00 -0.886 0.375555
## as.factor(hour)3 -2.102e+00 1.412e+00 -1.489 0.136585
## as.factor(hour)4 -2.092e+00 1.415e+00 -1.478 0.139402
## as.factor(hour)5 -2.044e+00 1.414e+00 -1.446 0.148196
## as.factor(hour)6 2.788e-01 1.410e+00 0.198 0.843225
## as.factor(hour)7 8.426e-01 1.400e+00 0.602 0.547263
## as.factor(hour)8 1.144e+00 1.393e+00 0.821 0.411755
## as.factor(hour)9 -1.819e-01 1.400e+00 -0.130 0.896637
## as.factor(hour)10 -1.135e+00 1.413e+00 -0.803 0.421949
## as.factor(hour)11 -3.222e+00 1.424e+00 -2.263 0.023635 *
## as.factor(hour)12 -3.965e+00 1.434e+00 -2.764 0.005706 **
## as.factor(hour)13 -4.515e+00 1.439e+00 -3.136 0.001713 **
## as.factor(hour)14 -4.784e+00 1.443e+00 -3.314 0.000921 ***
## as.factor(hour)15 -3.738e+00 1.441e+00 -2.594 0.009500 **
## as.factor(hour)16 -2.205e+00 1.431e+00 -1.541 0.123394
## as.factor(hour)17 -8.222e-01 1.421e+00 -0.579 0.562777
## as.factor(hour)18 1.943e+00 1.409e+00 1.379 0.167913
## as.factor(hour)19 4.082e+00 1.404e+00 2.908 0.003636 **
## as.factor(hour)20 5.449e+00 1.402e+00 3.887 0.000102 ***
## as.factor(hour)21 4.072e+00 1.399e+00 2.911 0.003607 **
## as.factor(hour)22 3.452e+00 1.400e+00 2.466 0.013685 *
## as.factor(hour)23 1.666e+00 1.402e+00 1.188 0.234682
## as.factor(season)2 NA NA NA NA
## as.factor(season)3 NA NA NA NA
## as.factor(season)4 NA NA NA NA
## DEWP 3.505e+00 2.514e-01 13.942 < 2e-16 ***
## HUMI -8.815e-01 6.479e-02 -13.606 < 2e-16 ***
## PRES -7.206e-01 6.202e-02 -11.618 < 2e-16 ***
## TEMP -2.658e+00 2.445e-01 -10.869 < 2e-16 ***
## as.factor(cbwd)NE -2.243e+01 1.152e+00 -19.467 < 2e-16 ***
## as.factor(cbwd)NW 5.346e+00 1.189e+00 4.495 7.00e-06 ***
## as.factor(cbwd)SE -2.213e+01 1.175e+00 -18.836 < 2e-16 ***
## as.factor(cbwd)SW -4.802e+00 1.260e+00 -3.811 0.000139 ***
## Iws -9.519e-02 2.883e-03 -33.015 < 2e-16 ***
## precipitation -6.658e-01 2.076e-01 -3.207 0.001342 **
## Iprec -3.023e-01 3.071e-02 -9.843 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35.72 on 31727 degrees of freedom
## Multiple R-squared: 0.3092, Adjusted R-squared: 0.3076
## F-statistic: 186.9 on 76 and 31727 DF, p-value: < 2.2e-16
# intuitively, we can remove the day variable since day of the month shouldn't really correlate with PM levels.
# some variables would be easier to interpret if mean centered, or have its values slightly changed in other ways
# dew point, humidity, pressure: to mean-center. We can interpret weather with average dew point, humidity & pressure
DEWP.c = DEWP - mean(DEWP)
HUMI.c = HUMI - mean(HUMI)
PRES.c = PRES - mean(PRES)
TEMP.c = TEMP - mean(TEMP)
# year: -2012, setting year 2012 as the baseline
year.n = year - 2012
# hour: changing it to a binary categorical variable, rush hour (7-10, 16-19 according to Shanghai Rush Hour Highway Regulations) & non-rush hour
n = nrow(df)
rush <- c(7:10, 16:19)
rushhour = rep(0, n)
rushhour[hour %in% rush] = 1
nonrushhour = rep(1, n)
nonrushhour[hour %in% rush] = 0
# since months and seasons basically have the same trend, this caused our regression to generate NAs.
# summary: adjusted some variables to aid interpretation
# next steps: fit linear regression with the above adjusted variables, and either seasons or months variable depending on which one fits better.
# regression with adjusted values & month variable, no season variable
reg2 = lm(PM_US.Post ~ year.n + as.factor(month) + rushhour + nonrushhour +
DEWP.c + HUMI.c + PRES.c + TEMP.c + as.factor(cbwd) + Iws + precipitation + Iprec, data = df)
summary(reg2)
##
## Call:
## lm(formula = PM_US.Post ~ year.n + as.factor(month) + rushhour +
## nonrushhour + DEWP.c + HUMI.c + PRES.c + TEMP.c + as.factor(cbwd) +
## Iws + precipitation + Iprec, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -85.26 -20.61 -6.08 12.54 596.68
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 113.99900 1.46587 77.769 < 2e-16 ***
## year.n -1.82193 0.18572 -9.810 < 2e-16 ***
## as.factor(month)2 -21.08202 0.97198 -21.690 < 2e-16 ***
## as.factor(month)3 -25.50259 0.99159 -25.719 < 2e-16 ***
## as.factor(month)4 -38.21605 1.16952 -32.677 < 2e-16 ***
## as.factor(month)5 -43.97492 1.31862 -33.349 < 2e-16 ***
## as.factor(month)6 -60.87114 1.50588 -40.422 < 2e-16 ***
## as.factor(month)7 -77.13446 1.72814 -44.634 < 2e-16 ***
## as.factor(month)8 -78.21257 1.65849 -47.159 < 2e-16 ***
## as.factor(month)9 -65.09787 1.46996 -44.285 < 2e-16 ***
## as.factor(month)10 -49.29032 1.28126 -38.470 < 2e-16 ***
## as.factor(month)11 -25.48690 1.07593 -23.688 < 2e-16 ***
## as.factor(month)12 0.53096 0.96804 0.548 0.583359
## rushhour 1.45581 0.42960 3.389 0.000703 ***
## nonrushhour NA NA NA NA
## DEWP.c 3.83552 0.24829 15.448 < 2e-16 ***
## HUMI.c -0.94115 0.06365 -14.786 < 2e-16 ***
## PRES.c -0.72969 0.06049 -12.063 < 2e-16 ***
## TEMP.c -3.14343 0.24025 -13.084 < 2e-16 ***
## as.factor(cbwd)NE -22.03000 1.14790 -19.192 < 2e-16 ***
## as.factor(cbwd)NW 5.02584 1.18854 4.229 2.36e-05 ***
## as.factor(cbwd)SE -21.05242 1.17118 -17.975 < 2e-16 ***
## as.factor(cbwd)SW -4.51206 1.26290 -3.573 0.000354 ***
## Iws -0.09517 0.00288 -33.050 < 2e-16 ***
## precipitation -0.71554 0.20821 -3.437 0.000590 ***
## Iprec -0.30198 0.03051 -9.897 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35.89 on 31779 degrees of freedom
## Multiple R-squared: 0.3013, Adjusted R-squared: 0.3008
## F-statistic: 571.1 on 24 and 31779 DF, p-value: < 2.2e-16
# R^2: 0.3013
# regression with adjusted values & season variable, no month variable
reg3 = lm(PM_US.Post ~ year.n + as.factor(season) + rushhour + nonrushhour +
DEWP.c + HUMI.c + PRES.c + TEMP.c + as.factor(cbwd) + Iws + precipitation + Iprec, data = df)
summary(reg3)
##
## Call:
## lm(formula = PM_US.Post ~ year.n + as.factor(season) + rushhour +
## nonrushhour + DEWP.c + HUMI.c + PRES.c + TEMP.c + as.factor(cbwd) +
## Iws + precipitation + Iprec, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -85.98 -21.53 -6.79 12.44 584.62
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 75.958328 1.228418 61.834 < 2e-16 ***
## year.n -1.568377 0.190209 -8.246 < 2e-16 ***
## as.factor(season)2 -19.819548 0.788907 -25.123 < 2e-16 ***
## as.factor(season)3 -6.218897 0.699650 -8.889 < 2e-16 ***
## as.factor(season)4 15.953218 0.734993 21.705 < 2e-16 ***
## rushhour 2.056458 0.441807 4.655 3.26e-06 ***
## nonrushhour NA NA NA NA
## DEWP.c 2.901329 0.252993 11.468 < 2e-16 ***
## HUMI.c -0.819754 0.065183 -12.576 < 2e-16 ***
## PRES.c -0.588082 0.059037 -9.961 < 2e-16 ***
## TEMP.c -3.489853 0.246837 -14.138 < 2e-16 ***
## as.factor(cbwd)NE -22.972442 1.178931 -19.486 < 2e-16 ***
## as.factor(cbwd)NW 5.362890 1.221952 4.389 1.14e-05 ***
## as.factor(cbwd)SE -20.111706 1.203243 -16.715 < 2e-16 ***
## as.factor(cbwd)SW -1.636168 1.295060 -1.263 0.2065
## Iws -0.096956 0.002948 -32.892 < 2e-16 ***
## precipitation -0.665778 0.214350 -3.106 0.0019 **
## Iprec -0.286124 0.031333 -9.132 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 36.96 on 31787 degrees of freedom
## Multiple R-squared: 0.2591, Adjusted R-squared: 0.2588
## F-statistic: 694.8 on 16 and 31787 DF, p-value: < 2.2e-16
# R^2: 0.2591
# the regression with months leads to a better model, as indicated by a 5% higher Rsquared value.
# intuitively, using a month variable instead of a season variable would lead to more precise predictions.
# we will get rid of the season variable and just use months as a categorical variable.
#check for multicollinearity between all continous variables
cont <- c(2, 8:11, 13:15)
contVars = df[cont]
cor(contVars)
## year DEWP HUMI PRES
## year 1.0000000000 1.723402e-02 0.01399557 0.01825623
## DEWP 0.0172340233 1.000000e+00 0.41646008 -0.85850461
## HUMI 0.0139955717 4.164601e-01 1.00000000 -0.22420056
## PRES 0.0182562321 -8.585046e-01 -0.22420056 1.00000000
## TEMP 0.0137383992 8.841011e-01 -0.04750197 -0.83909624
## Iws -0.0597930399 -4.870014e-05 0.03955909 0.01072049
## precipitation -0.0004504604 8.737077e-02 0.15023012 -0.09335344
## Iprec -0.0117851657 8.081718e-02 0.16947617 -0.08746455
## TEMP Iws precipitation Iprec
## year 0.01373840 -5.979304e-02 -0.0004504604 -0.01178517
## DEWP 0.88410110 -4.870014e-05 0.0873707654 0.08081718
## HUMI -0.04750197 3.955909e-02 0.1502301226 0.16947617
## PRES -0.83909624 1.072049e-02 -0.0933534449 -0.08746455
## TEMP 1.00000000 -2.475607e-02 0.0289201440 0.01421182
## Iws -0.02475607 1.000000e+00 0.0443211028 0.05878068
## precipitation 0.02892014 4.432110e-02 1.0000000000 0.43006888
## Iprec 0.01421182 5.878068e-02 0.4300688789 1.00000000
# multicollinearity (>0.8) between: pressure & dew point, pressure & temperature, dew point & temperature
# multicollinear variables: pressure, dew point, temperature
# this makes intuitive sense because the three weather factors are almost direct factors of each other
# fortunately, these variables are not messing up the standard errors, so we don't have to remove them
# summary: after looking at multi-collinearity values bewteen variables, we can consider removing: pressure, dew point.
# next step: We will see which multicollinearity variables we can remove by manually doing some f-tests.
# we will manually use nested f-tests to see if some of the variables can be removed from our regression
# variables to test: pressure, dew point, temperature, days
# reg2 = lm(PM_US.Post ~ year.n + as.factor(month) + rushhour + nonrushhour +
# DEWP.c + HUMI.c + PRES.c + TEMP + as.factor(cbwd) + Iws + precipitation + Iprec, data = df)
# regression with no pressure variable
reg_noPRES = lm(PM_US.Post ~ year.n + as.factor(month) + rushhour + nonrushhour +
DEWP.c + HUMI.c + TEMP + as.factor(cbwd) + Iws + precipitation + Iprec, data = df)
# regression with no dew point variable
reg_noDEWP = lm(PM_US.Post ~ year.n + as.factor(month) + rushhour + nonrushhour +
HUMI.c + PRES.c + TEMP + as.factor(cbwd) + Iws + precipitation + Iprec, data = df)
#regression with no temperature
reg_noTEMP = lm(PM_US.Post ~ year.n + as.factor(month) + rushhour + nonrushhour +
DEWP.c + HUMI.c + PRES.c + as.factor(cbwd) + Iws + precipitation + Iprec, data = df)
# do ANOVA f-test on each of the regressions (VS original regression)
anova(reg2, reg_noPRES)
## Analysis of Variance Table
##
## Model 1: PM_US.Post ~ year.n + as.factor(month) + rushhour + nonrushhour +
## DEWP.c + HUMI.c + PRES.c + TEMP.c + as.factor(cbwd) + Iws +
## precipitation + Iprec
## Model 2: PM_US.Post ~ year.n + as.factor(month) + rushhour + nonrushhour +
## DEWP.c + HUMI.c + TEMP + as.factor(cbwd) + Iws + precipitation +
## Iprec
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 31779 40939572
## 2 31780 41127039 -1 -187467 145.52 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(reg2, reg_noDEWP)
## Analysis of Variance Table
##
## Model 1: PM_US.Post ~ year.n + as.factor(month) + rushhour + nonrushhour +
## DEWP.c + HUMI.c + PRES.c + TEMP.c + as.factor(cbwd) + Iws +
## precipitation + Iprec
## Model 2: PM_US.Post ~ year.n + as.factor(month) + rushhour + nonrushhour +
## HUMI.c + PRES.c + TEMP + as.factor(cbwd) + Iws + precipitation +
## Iprec
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 31779 40939572
## 2 31780 41247004 -1 -307433 238.64 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(reg2, reg_noTEMP)
## Analysis of Variance Table
##
## Model 1: PM_US.Post ~ year.n + as.factor(month) + rushhour + nonrushhour +
## DEWP.c + HUMI.c + PRES.c + TEMP.c + as.factor(cbwd) + Iws +
## precipitation + Iprec
## Model 2: PM_US.Post ~ year.n + as.factor(month) + rushhour + nonrushhour +
## DEWP.c + HUMI.c + PRES.c + as.factor(cbwd) + Iws + precipitation +
## Iprec
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 31779 40939572
## 2 31780 41160118 -1 -220546 171.2 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# summary: we aren't able to manually remove any of the multicollinear variables by f-tests
# next step: try automatically removing variables by backwards selection
# first backwards selection: removed baseline nonrushhour variable
reg4 <- step(reg2,direction="backward")
## Start: AIC=227774.9
## PM_US.Post ~ year.n + as.factor(month) + rushhour + nonrushhour +
## DEWP.c + HUMI.c + PRES.c + TEMP.c + as.factor(cbwd) + Iws +
## precipitation + Iprec
##
##
## Step: AIC=227774.9
## PM_US.Post ~ year.n + as.factor(month) + rushhour + DEWP.c +
## HUMI.c + PRES.c + TEMP.c + as.factor(cbwd) + Iws + precipitation +
## Iprec
##
## Df Sum of Sq RSS AIC
## <none> 40939572 227775
## - rushhour 1 14794 40954366 227784
## - precipitation 1 15215 40954787 227785
## - year.n 1 123974 41063546 227869
## - Iprec 1 126185 41065756 227871
## - PRES.c 1 187467 41127039 227918
## - TEMP.c 1 220546 41160118 227944
## - HUMI.c 1 281632 41221204 227991
## - DEWP.c 1 307433 41247004 228011
## - Iws 1 1407130 42346701 228848
## - as.factor(cbwd) 4 3420722 44360294 230319
## - as.factor(month) 11 3935770 44875342 230672
summary(reg4)
##
## Call:
## lm(formula = PM_US.Post ~ year.n + as.factor(month) + rushhour +
## DEWP.c + HUMI.c + PRES.c + TEMP.c + as.factor(cbwd) + Iws +
## precipitation + Iprec, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -85.26 -20.61 -6.08 12.54 596.68
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 113.99900 1.46587 77.769 < 2e-16 ***
## year.n -1.82193 0.18572 -9.810 < 2e-16 ***
## as.factor(month)2 -21.08202 0.97198 -21.690 < 2e-16 ***
## as.factor(month)3 -25.50259 0.99159 -25.719 < 2e-16 ***
## as.factor(month)4 -38.21605 1.16952 -32.677 < 2e-16 ***
## as.factor(month)5 -43.97492 1.31862 -33.349 < 2e-16 ***
## as.factor(month)6 -60.87114 1.50588 -40.422 < 2e-16 ***
## as.factor(month)7 -77.13446 1.72814 -44.634 < 2e-16 ***
## as.factor(month)8 -78.21257 1.65849 -47.159 < 2e-16 ***
## as.factor(month)9 -65.09787 1.46996 -44.285 < 2e-16 ***
## as.factor(month)10 -49.29032 1.28126 -38.470 < 2e-16 ***
## as.factor(month)11 -25.48690 1.07593 -23.688 < 2e-16 ***
## as.factor(month)12 0.53096 0.96804 0.548 0.583359
## rushhour 1.45581 0.42960 3.389 0.000703 ***
## DEWP.c 3.83552 0.24829 15.448 < 2e-16 ***
## HUMI.c -0.94115 0.06365 -14.786 < 2e-16 ***
## PRES.c -0.72969 0.06049 -12.063 < 2e-16 ***
## TEMP.c -3.14343 0.24025 -13.084 < 2e-16 ***
## as.factor(cbwd)NE -22.03000 1.14790 -19.192 < 2e-16 ***
## as.factor(cbwd)NW 5.02584 1.18854 4.229 2.36e-05 ***
## as.factor(cbwd)SE -21.05242 1.17118 -17.975 < 2e-16 ***
## as.factor(cbwd)SW -4.51206 1.26290 -3.573 0.000354 ***
## Iws -0.09517 0.00288 -33.050 < 2e-16 ***
## precipitation -0.71554 0.20821 -3.437 0.000590 ***
## Iprec -0.30198 0.03051 -9.897 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35.89 on 31779 degrees of freedom
## Multiple R-squared: 0.3013, Adjusted R-squared: 0.3008
## F-statistic: 571.1 on 24 and 31779 DF, p-value: < 2.2e-16
# since no real variables have been removed, the two models are the same. the anova p-value is 0, indicates no change
anova(reg2, reg4)
## Analysis of Variance Table
##
## Model 1: PM_US.Post ~ year.n + as.factor(month) + rushhour + nonrushhour +
## DEWP.c + HUMI.c + PRES.c + TEMP.c + as.factor(cbwd) + Iws +
## precipitation + Iprec
## Model 2: PM_US.Post ~ year.n + as.factor(month) + rushhour + DEWP.c +
## HUMI.c + PRES.c + TEMP.c + as.factor(cbwd) + Iws + precipitation +
## Iprec
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 31779 40939572
## 2 31779 40939572 0 0
# no further variables removed (other than clarifying baseline for nonrushhour variable), nothing smaller than the given AIC
# as shown in the exploratory data analysis, we should do some transformations as many linear regression assumptions are violated
# have to add 0.1 to transformed x-variables since we cannot log (0)
# x-variables to transform (non-linearity): log(Iws+0.1), log(precipitation+0.1), log(Iprec+0.1)
Iws.log = log(Iws+0.1)
precipitation.log = log(precipitation+0.1)
Iprec.log = log(Iprec+0.1)
# x-variables to square (non-linearity)
DEWP.c.2 = DEWP.c^2
HUMI.c.2 = HUMI.c^2
PRES.c.2 = PRES.c^2
TEMP.c.2 = TEMP.c^2
# y-varable to transform (non-constant variance): log(PM_US.Post)
PM_US.Post.log = log(PM_US.Post)
# try with these transformations
boxplot(log(PM_US.Post)~month,data=df, main="Month VS log PM2.5 Concentration", xlab="Month", ylab="log PM2.5 Concentration (ug/m^3)")
boxplot(log(PM_US.Post)~rushhour,data=df, main="Rush Hour VS log PM2.5 Concentration", xlab="Rush Hour", ylab="log PM2.5 Concentration (ug/m^3)")
plot(x = DEWP.c.2 + DEWP.c, y = log(PM_US.Post), main="Dew Point Centered VS log PM2.5 Concentration", xlab="Dew Point Centered (ºC)", ylab="log PM2.5 Concentration (ug/m^3)")
plot(x = HUMI.c.2 + HUMI.c, y = log(PM_US.Post), main="Humidity Centered VS PM2.5 Concentration", xlab="Humidity Centered (%)", ylab="log PM2.5 Concentration (ug/m^3)")
plot(x = PRES.c.2 + PRES.c, y = log(PM_US.Post), main="Pressure Centered VS PM2.5 Concentration", xlab="Pressure Centered (hPa)", ylab="log PM2.5 Concentration (ug/m^3)")
plot(x = TEMP.c.2 + TEMP.c, y = log(PM_US.Post), main="Temperature VS log PM2.5 Concentration", xlab="Temperature (ºC)", ylab="log PM2.5 Concentration (ug/m^3)")
boxplot(log(PM_US.Post)~cbwd,data=df, main="Combined Wind Direction VS log PM2.5 Concentration", xlab="Combined Wind Direction", ylab="log PM2.5 Concentration (ug/m^3)")
plot(x = log(Iws+0.1), y = log(PM_US.Post), main="log Cumulated Wind Speed VS log PM2.5 Concentration", xlab="log Cumulated wind speed (m/s)", ylab="log PM2.5 Concentration (ug/m^3)")
plot(x = log(precipitation+0.1), y = log(PM_US.Post), main="log Hourly Percipitation VS log PM2.5 Concentration", xlab="log Hourly Precipitation (mm)", ylab="log PM2.5 Concentration (ug/m^3)")
plot(x = log(Iprec+0.1), y = log(PM_US.Post), main="log Cumulated Percipitation VS log PM2.5 Concentration", xlab="log Cumulated precipitation (mm)", ylab="log PM2.5 Concentration (ug/m^3)")
# these transformations are not ideal but they're fine
# overall, they fix the non-linear and non-constant variance violations we were concerned about
# regression with just log y
reg5 = lm(PM_US.Post.log ~ year + as.factor(month) + rushhour + DEWP.c +
HUMI.c + PRES.c + TEMP.c + as.factor(cbwd) + Iws + precipitation +
Iprec, data = df)
summary(reg5)
##
## Call:
## lm(formula = PM_US.Post.log ~ year + as.factor(month) + rushhour +
## DEWP.c + HUMI.c + PRES.c + TEMP.c + as.factor(cbwd) + Iws +
## precipitation + Iprec, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.0838 -0.3750 0.0156 0.4024 2.7291
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.489e+01 6.495e+00 2.293 0.0218 *
## year -5.075e-03 3.226e-03 -1.573 0.1156
## as.factor(month)2 -2.781e-01 1.688e-02 -16.477 < 2e-16 ***
## as.factor(month)3 -3.029e-01 1.722e-02 -17.587 < 2e-16 ***
## as.factor(month)4 -4.497e-01 2.031e-02 -22.141 < 2e-16 ***
## as.factor(month)5 -5.700e-01 2.290e-02 -24.888 < 2e-16 ***
## as.factor(month)6 -9.656e-01 2.615e-02 -36.922 < 2e-16 ***
## as.factor(month)7 -1.462e+00 3.001e-02 -48.718 < 2e-16 ***
## as.factor(month)8 -1.534e+00 2.880e-02 -53.269 < 2e-16 ***
## as.factor(month)9 -1.163e+00 2.553e-02 -45.572 < 2e-16 ***
## as.factor(month)10 -7.567e-01 2.225e-02 -34.004 < 2e-16 ***
## as.factor(month)11 -3.149e-01 1.869e-02 -16.852 < 2e-16 ***
## as.factor(month)12 -4.991e-03 1.681e-02 -0.297 0.7666
## rushhour 5.029e-02 7.461e-03 6.740 1.61e-11 ***
## DEWP.c 6.233e-02 4.312e-03 14.454 < 2e-16 ***
## HUMI.c -1.757e-02 1.105e-03 -15.896 < 2e-16 ***
## PRES.c -1.316e-02 1.051e-03 -12.529 < 2e-16 ***
## TEMP.c -5.363e-02 4.172e-03 -12.853 < 2e-16 ***
## as.factor(cbwd)NE -3.782e-01 1.994e-02 -18.970 < 2e-16 ***
## as.factor(cbwd)NW 9.796e-02 2.064e-02 4.746 2.09e-06 ***
## as.factor(cbwd)SE -3.763e-01 2.034e-02 -18.500 < 2e-16 ***
## as.factor(cbwd)SW -2.096e-02 2.193e-02 -0.956 0.3392
## Iws -2.435e-03 5.001e-05 -48.695 < 2e-16 ***
## precipitation -1.430e-02 3.616e-03 -3.955 7.66e-05 ***
## Iprec -1.040e-02 5.299e-04 -19.618 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6234 on 31779 degrees of freedom
## Multiple R-squared: 0.4013, Adjusted R-squared: 0.4008
## F-statistic: 887.5 on 24 and 31779 DF, p-value: < 2.2e-16
# R^2: 0.4012
# regression with log y & x^2
reg6 = lm(PM_US.Post.log ~ year + as.factor(month) + rushhour + DEWP.c + DEWP.c.2 +
HUMI.c + HUMI.c.2 + PRES.c + PRES.c.2 + TEMP.c + TEMP.c.2 + as.factor(cbwd) + Iws + precipitation +
Iprec, data = df)
summary(reg6)
##
## Call:
## lm(formula = PM_US.Post.log ~ year + as.factor(month) + rushhour +
## DEWP.c + DEWP.c.2 + HUMI.c + HUMI.c.2 + PRES.c + PRES.c.2 +
## TEMP.c + TEMP.c.2 + as.factor(cbwd) + Iws + precipitation +
## Iprec, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.0263 -0.3706 0.0167 0.4030 2.6933
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.202e+00 6.528e+00 0.950 0.34211
## year -7.102e-04 3.242e-03 -0.219 0.82661
## as.factor(month)2 -2.994e-01 1.685e-02 -17.769 < 2e-16 ***
## as.factor(month)3 -3.164e-01 1.750e-02 -18.083 < 2e-16 ***
## as.factor(month)4 -4.504e-01 2.071e-02 -21.747 < 2e-16 ***
## as.factor(month)5 -5.287e-01 2.303e-02 -22.960 < 2e-16 ***
## as.factor(month)6 -8.330e-01 2.711e-02 -30.731 < 2e-16 ***
## as.factor(month)7 -1.302e+00 3.259e-02 -39.963 < 2e-16 ***
## as.factor(month)8 -1.376e+00 3.147e-02 -43.715 < 2e-16 ***
## as.factor(month)9 -1.086e+00 2.655e-02 -40.895 < 2e-16 ***
## as.factor(month)10 -7.340e-01 2.278e-02 -32.227 < 2e-16 ***
## as.factor(month)11 -2.989e-01 1.933e-02 -15.464 < 2e-16 ***
## as.factor(month)12 1.764e-02 1.681e-02 1.049 0.29404
## rushhour 5.288e-02 7.415e-03 7.132 1.01e-12 ***
## DEWP.c -5.596e-02 1.364e-02 -4.103 4.09e-05 ***
## DEWP.c.2 -3.673e-04 7.527e-05 -4.880 1.06e-06 ***
## HUMI.c 7.336e-03 3.102e-03 2.365 0.01804 *
## HUMI.c.2 -2.998e-04 3.428e-05 -8.745 < 2e-16 ***
## PRES.c -1.433e-02 1.055e-03 -13.579 < 2e-16 ***
## PRES.c.2 -4.121e-04 5.763e-05 -7.150 8.89e-13 ***
## TEMP.c 5.143e-02 1.298e-02 3.961 7.47e-05 ***
## TEMP.c.2 1.584e-04 7.577e-05 2.090 0.03658 *
## as.factor(cbwd)NE -3.811e-01 1.987e-02 -19.175 < 2e-16 ***
## as.factor(cbwd)NW 1.111e-01 2.053e-02 5.412 6.28e-08 ***
## as.factor(cbwd)SE -3.785e-01 2.028e-02 -18.660 < 2e-16 ***
## as.factor(cbwd)SW -2.230e-02 2.182e-02 -1.022 0.30676
## Iws -2.381e-03 4.999e-05 -47.624 < 2e-16 ***
## precipitation -9.495e-03 3.603e-03 -2.635 0.00841 **
## Iprec -9.259e-03 5.311e-04 -17.433 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6194 on 31775 degrees of freedom
## Multiple R-squared: 0.409, Adjusted R-squared: 0.4085
## F-statistic: 785.4 on 28 and 31775 DF, p-value: < 2.2e-16
# R^2: 0.409
# regression with log y & log x
reg7 = lm(PM_US.Post.log ~ year + as.factor(month) + rushhour + DEWP.c +
HUMI.c + PRES.c + TEMP.c + as.factor(cbwd) + Iws.log + precipitation.log +
Iprec.log, data = df)
summary(reg7)
##
## Call:
## lm(formula = PM_US.Post.log ~ year + as.factor(month) + rushhour +
## DEWP.c + HUMI.c + PRES.c + TEMP.c + as.factor(cbwd) + Iws.log +
## precipitation.log + Iprec.log, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.9620 -0.3731 0.0165 0.4060 2.8044
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.896950 6.498430 1.369 0.17098
## year -0.002233 0.003227 -0.692 0.48909
## as.factor(month)2 -0.259151 0.016899 -15.335 < 2e-16 ***
## as.factor(month)3 -0.308689 0.017242 -17.903 < 2e-16 ***
## as.factor(month)4 -0.462195 0.020340 -22.723 < 2e-16 ***
## as.factor(month)5 -0.599272 0.022933 -26.131 < 2e-16 ***
## as.factor(month)6 -1.005824 0.026209 -38.377 < 2e-16 ***
## as.factor(month)7 -1.556281 0.030017 -51.847 < 2e-16 ***
## as.factor(month)8 -1.607217 0.028849 -55.712 < 2e-16 ***
## as.factor(month)9 -1.217443 0.025567 -47.618 < 2e-16 ***
## as.factor(month)10 -0.803986 0.022265 -36.110 < 2e-16 ***
## as.factor(month)11 -0.337652 0.018708 -18.048 < 2e-16 ***
## as.factor(month)12 -0.013597 0.016832 -0.808 0.41922
## rushhour 0.054326 0.007474 7.269 3.72e-13 ***
## DEWP.c 0.044304 0.004433 9.994 < 2e-16 ***
## HUMI.c -0.011708 0.001150 -10.180 < 2e-16 ***
## PRES.c -0.012585 0.001052 -11.961 < 2e-16 ***
## TEMP.c -0.033409 0.004275 -7.814 5.69e-15 ***
## as.factor(cbwd)NE -0.046229 0.022496 -2.055 0.03989 *
## as.factor(cbwd)NW 0.427626 0.022902 18.672 < 2e-16 ***
## as.factor(cbwd)SE -0.058612 0.022652 -2.587 0.00967 **
## as.factor(cbwd)SW 0.272054 0.023293 11.680 < 2e-16 ***
## Iws.log -0.108802 0.002580 -42.170 < 2e-16 ***
## precipitation.log 0.030167 0.011755 2.566 0.01028 *
## Iprec.log -0.123121 0.007241 -17.003 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6241 on 31779 degrees of freedom
## Multiple R-squared: 0.3998, Adjusted R-squared: 0.3994
## F-statistic: 882.1 on 24 and 31779 DF, p-value: < 2.2e-16
# R^2: 0.3998
# regression with log y, log x and x^2
reg8 = lm(PM_US.Post.log ~ year + as.factor(month) + rushhour + DEWP.c + DEWP.c.2 +
HUMI.c + HUMI.c.2 + PRES.c + PRES.c.2 + TEMP.c + TEMP.c.2 + as.factor(cbwd) + Iws.log + precipitation.log +
Iprec.log, data = df)
summary(reg8)
##
## Call:
## lm(formula = PM_US.Post.log ~ year + as.factor(month) + rushhour +
## DEWP.c + DEWP.c.2 + HUMI.c + HUMI.c.2 + PRES.c + PRES.c.2 +
## TEMP.c + TEMP.c.2 + as.factor(cbwd) + Iws.log + precipitation.log +
## Iprec.log, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.8897 -0.3703 0.0161 0.4015 2.7521
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.078e+00 6.525e+00 0.318 0.750171
## year 1.203e-03 3.241e-03 0.371 0.710571
## as.factor(month)2 -2.878e-01 1.685e-02 -17.077 < 2e-16 ***
## as.factor(month)3 -3.332e-01 1.749e-02 -19.049 < 2e-16 ***
## as.factor(month)4 -4.710e-01 2.072e-02 -22.736 < 2e-16 ***
## as.factor(month)5 -5.535e-01 2.303e-02 -24.035 < 2e-16 ***
## as.factor(month)6 -8.429e-01 2.712e-02 -31.081 < 2e-16 ***
## as.factor(month)7 -1.343e+00 3.259e-02 -41.228 < 2e-16 ***
## as.factor(month)8 -1.398e+00 3.149e-02 -44.396 < 2e-16 ***
## as.factor(month)9 -1.124e+00 2.656e-02 -42.303 < 2e-16 ***
## as.factor(month)10 -7.844e-01 2.276e-02 -34.470 < 2e-16 ***
## as.factor(month)11 -3.331e-01 1.932e-02 -17.239 < 2e-16 ***
## as.factor(month)12 5.897e-03 1.681e-02 0.351 0.725658
## rushhour 5.696e-02 7.419e-03 7.677 1.67e-14 ***
## DEWP.c -2.081e-02 1.376e-02 -1.512 0.130520
## DEWP.c.2 -6.978e-04 7.593e-05 -9.189 < 2e-16 ***
## HUMI.c 7.662e-04 3.118e-03 0.246 0.805902
## HUMI.c.2 -1.588e-04 3.511e-05 -4.523 6.12e-06 ***
## PRES.c -1.348e-02 1.055e-03 -12.778 < 2e-16 ***
## PRES.c.2 -5.186e-04 5.749e-05 -9.020 < 2e-16 ***
## TEMP.c 1.930e-02 1.310e-02 1.473 0.140766
## TEMP.c.2 3.276e-04 7.641e-05 4.287 1.82e-05 ***
## as.factor(cbwd)NE -5.371e-02 2.239e-02 -2.398 0.016478 *
## as.factor(cbwd)NW 4.382e-01 2.275e-02 19.261 < 2e-16 ***
## as.factor(cbwd)SE -6.344e-02 2.255e-02 -2.813 0.004913 **
## as.factor(cbwd)SW 2.692e-01 2.314e-02 11.634 < 2e-16 ***
## Iws.log -1.074e-01 2.568e-03 -41.835 < 2e-16 ***
## precipitation.log 4.046e-02 1.168e-02 3.464 0.000533 ***
## Iprec.log -1.212e-01 7.257e-03 -16.696 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6194 on 31775 degrees of freedom
## Multiple R-squared: 0.409, Adjusted R-squared: 0.4084
## F-statistic: 785.2 on 28 and 31775 DF, p-value: < 2.2e-16
# R^2: 0.409
# the higher R-squared value from this regression implies that these transformed models are better fits than the previous model
# within the transformed regressions above, there's not much difference in the R^2 values. for interpretation purposes, we will want to work with regression 5 (log y, no x transformations) which will make explaining co-efficients easier. But we need to check with residual plots.
# sumamry: fitted regression with tranformed variables: logy, x^2 (some), logx (some)
# next step: check for interaction variables
# we want to do residual plots for each of the regressions above (reg 5,6,7,8)
# regression 5 (log y, no x transformations)
# variables that didn't need transformations
plot(y = reg5$residual, x=year, main = "Reg 5, Year VS Residuals", ylab = "Residuals")
boxplot(reg5$residual ~ as.factor(month), main = "Reg 5, Plot of Residuals", ylab = "Residuals")
boxplot(reg5$residual ~ rushhour, main = "Reg 5, Rushhour of Residuals", ylab = "Residuals")
boxplot(reg5$residual ~ as.factor(cbwd), main = "Reg 5, Combined Wind Direction of Residuals", ylab = "Residuals")
# variables possibly to be squared
plot(y = reg5$residual, x=DEWP.c, main = "Dew Point VS Residuals", ylab = "Residuals")
plot(y = reg5$residual, x=HUMI.c, main = "Humidity VS Residuals", ylab = "Residuals")
plot(y = reg5$residual, x=PRES.c, main = "Pressure VS Residuals", ylab = "Residuals")
plot(y = reg5$residual, x=TEMP.c, main = "Temperature VS Residuals", ylab = "Residuals")
# variables possiblly to be logged
plot(y = reg5$residual, x=Iws, main = "Cumulative Wind Speed VS Residuals", ylab = "Residuals")
plot(y = reg5$residual, x=precipitation, main = "Hourly Precipitation VS Residuals", ylab = "Residuals")
plot(y = reg5$residual, x=Iprec, main = "Cumulative Precipitation VS Residuals", ylab = "Residuals")
# these don't look great, definately need some sort of transformation
# x^2 variables
plot(y = reg6$residual, x=DEWP.c + DEWP.c.2, main = "Dew Point^2 VS Residuals", ylab = "Residuals")
plot(y = reg6$residual, x=HUMI.c + HUMI.c.2, main = "Humidity^2 VS Residuals", ylab = "Residuals")
plot(y = reg6$residual, x=PRES.c + PRES.c.2, main = "Pressure^2 VS Residuals", ylab = "Residuals")
plot(y = reg6$residual, x=TEMP.c + TEMP.c.2, main = "Temperature^2 VS Residuals", ylab = "Residuals")
# log x variables
plot(y = reg7$residual, x=Iws.log, main = "log Cumulative Wind Speed VS Residuals", ylab = "Residuals")
plot(y = reg7$residual, x=precipitation.log, main = "log Hourly Precipitation VS Residuals", ylab = "Residuals")
plot(y = reg7$residual, x=Iprec.log, main = "log Cumulative Precipitation VS Residuals", ylab = "Residuals")
# we can use xyplots to eye-ball possible interactions, testing variables as conditionals of various categorical variables
# variables to test: year, month, rushhour, humidity, temperature (dew point & pressure highly correlated), combined wind direction
# all other variables are continous with too many different levels
# lets see if any variables interact with year
xyplot(PM_US.Post.log ~ as.factor(month) | year)
xyplot(PM_US.Post.log ~ rushhour | year)
xyplot(PM_US.Post.log ~ DEWP.c | year)
xyplot(PM_US.Post.log ~ HUMI.c | year)
xyplot(PM_US.Post.log ~ PRES.c | year)
xyplot(PM_US.Post.log ~ TEMP | year)
xyplot(PM_US.Post.log ~ as.factor(cbwd) | year)
xyplot(PM_US.Post.log ~ Iws.log | year)
xyplot(PM_US.Post.log ~ precipitation.log | year)
xyplot(PM_US.Post.log ~ Iprec.log | year)
#no interactions with year
# lets see if any variables interact with month
xyplot(PM_US.Post.log ~ year | as.factor(month))
xyplot(PM_US.Post.log ~ rushhour | as.factor(month))
xyplot(PM_US.Post.log ~ DEWP.c | as.factor(month))
xyplot(PM_US.Post.log ~ HUMI.c | as.factor(month))
xyplot(PM_US.Post.log ~ PRES.c | as.factor(month))
xyplot(PM_US.Post.log ~ TEMP | as.factor(month))
xyplot(PM_US.Post.log ~ as.factor(cbwd) | as.factor(month))
xyplot(PM_US.Post.log ~ Iws.log | as.factor(month))
xyplot(PM_US.Post.log ~ precipitation.log | as.factor(month))
xyplot(PM_US.Post.log ~ Iprec.log | as.factor(month))
#
# lets see if any variables interact with rushhour
xyplot(PM_US.Post.log ~ year | rushhour)
xyplot(PM_US.Post.log ~ as.factor(month) | rushhour)
xyplot(PM_US.Post.log ~ DEWP.c | rushhour)
xyplot(PM_US.Post.log ~ HUMI.c | rushhour)
xyplot(PM_US.Post.log ~ PRES.c | rushhour)
xyplot(PM_US.Post.log ~ TEMP | rushhour)
xyplot(PM_US.Post.log ~ as.factor(cbwd) | rushhour)
xyplot(PM_US.Post.log ~ Iws.log | rushhour)
xyplot(PM_US.Post.log ~ precipitation.log | rushhour)
xyplot(PM_US.Post.log ~ Iprec.log | rushhour)
# no variables seems to interact with rushhour
# lets see if any variables interact with humidity
# split humidity into categorical variable with 6 levels
xyplot(PM_US.Post.log ~ year | cut(HUMI.c, 6))
xyplot(PM_US.Post.log ~ as.factor(month) | cut(HUMI.c, 6))
xyplot(PM_US.Post.log ~ rushhour | cut(HUMI.c, 6))
xyplot(PM_US.Post.log ~ DEWP.c | cut(HUMI.c, 6))
xyplot(PM_US.Post.log ~ PRES.c | cut(HUMI.c, 6))
xyplot(PM_US.Post.log ~ TEMP | cut(HUMI.c, 6))
xyplot(PM_US.Post.log ~ as.factor(cbwd) | cut(HUMI.c, 6))
xyplot(PM_US.Post.log ~ Iws.log | cut(HUMI.c, 6))
xyplot(PM_US.Post.log ~ precipitation.log | cut(HUMI.c, 6))
xyplot(PM_US.Post.log ~ Iprec.log | cut(HUMI.c, 6))
#???????
# lets see if any variables interact with temperature
# split temperature into categorical variable with 6 levels
xyplot(PM_US.Post.log ~ year | cut(TEMP, 6))
xyplot(PM_US.Post.log ~ as.factor(month) | cut(TEMP, 6))
xyplot(PM_US.Post.log ~ rushhour | cut(TEMP, 6))
xyplot(PM_US.Post.log ~ DEWP.c | cut(TEMP, 6))
xyplot(PM_US.Post.log ~ HUMI.c | cut(TEMP, 6))
xyplot(PM_US.Post.log ~ PRES.c | cut(TEMP, 6))
xyplot(PM_US.Post.log ~ as.factor(cbwd) | cut(TEMP, 6))
xyplot(PM_US.Post.log ~ Iws.log | cut(TEMP, 6))
xyplot(PM_US.Post.log ~ precipitation.log | cut(TEMP, 6))
xyplot(PM_US.Post.log ~ Iprec.log | cut(TEMP, 6))
#???????
# temperature has high multicollinearity with pressure & dew point
# we can just use the temperature variable to account for pressure & dew point
# lets see if any variables interact with combined wind direction
xyplot(PM_US.Post.log ~ year | as.factor(cbwd))
xyplot(PM_US.Post.log ~ as.factor(month) | as.factor(cbwd))
xyplot(PM_US.Post.log ~ rushhour | as.factor(cbwd))
xyplot(PM_US.Post.log ~ DEWP.c | as.factor(cbwd))
xyplot(PM_US.Post.log ~ HUMI.c | as.factor(cbwd))
xyplot(PM_US.Post.log ~ PRES.c | as.factor(cbwd))
xyplot(PM_US.Post.log ~ TEMP | as.factor(cbwd))
xyplot(PM_US.Post.log ~ Iws.log | as.factor(cbwd))
xyplot(PM_US.Post.log ~ precipitation.log | as.factor(cbwd))
xyplot(PM_US.Post.log ~ Iprec.log | as.factor(cbwd))
#Iws.log + precipitation.log + Iprec.log
# lets see if any variables interact with log cumulated wind speed
xyplot(PM_US.Post.log ~ year | cut(Iws.log, 6))
xyplot(PM_US.Post.log ~ as.factor(month) | cut(Iws.log, 6))
xyplot(PM_US.Post.log ~ rushhour | cut(Iws.log, 6))
xyplot(PM_US.Post.log ~ DEWP.c | cut(Iws.log, 6))
xyplot(PM_US.Post.log ~ HUMI.c | cut(Iws.log, 6))
xyplot(PM_US.Post.log ~ PRES.c | cut(Iws.log, 6))
xyplot(PM_US.Post.log ~ TEMP | cut(Iws.log, 6))
xyplot(PM_US.Post.log ~ as.factor(cbwd) | cut(Iws.log, 6))
xyplot(PM_US.Post.log ~ precipitation.log | cut(Iws.log, 6))
xyplot(PM_US.Post.log ~ Iprec.log | cut(Iws.log, 6))
# no variables seem to interact with log cumulated wind speed
# lets see if any variables interact with log hourly precipitation
xyplot(PM_US.Post.log ~ year | cut(precipitation.log, 6))
xyplot(PM_US.Post.log ~ as.factor(month) | cut(precipitation.log, 6))
xyplot(PM_US.Post.log ~ rushhour | cut(precipitation.log, 6))
xyplot(PM_US.Post.log ~ DEWP.c | cut(precipitation.log, 6))
xyplot(PM_US.Post.log ~ HUMI.c | cut(precipitation.log, 6))
xyplot(PM_US.Post.log ~ PRES.c | cut(precipitation.log, 6))
xyplot(PM_US.Post.log ~ TEMP | cut(precipitation.log, 6))
xyplot(PM_US.Post.log ~ as.factor(cbwd) | cut(precipitation.log, 6))
xyplot(PM_US.Post.log ~ Iws.log | cut(precipitation.log, 6))
xyplot(PM_US.Post.log ~ precipitation.log | cut(precipitation.log, 6))
xyplot(PM_US.Post.log ~ Iprec.log | cut(precipitation.log, 6))
# no variables seem to interact with log hourly precipitation
# lets see if any variables interact with log cumulated precipitation
xyplot(PM_US.Post.log ~ year | cut(Iprec.log, 6))
xyplot(PM_US.Post.log ~ as.factor(month) | cut(Iprec.log, 6))
xyplot(PM_US.Post.log ~ rushhour | cut(Iprec.log, 6))
xyplot(PM_US.Post.log ~ DEWP.c | cut(Iprec.log, 6))
xyplot(PM_US.Post.log ~ HUMI.c | cut(Iprec.log, 6))
xyplot(PM_US.Post.log ~ PRES.c | cut(Iprec.log, 6))
xyplot(PM_US.Post.log ~ TEMP | cut(Iprec.log, 6))
xyplot(PM_US.Post.log ~ as.factor(cbwd) | cut(Iprec.log, 6))
xyplot(PM_US.Post.log ~ Iws.log | cut(Iprec.log, 6))
xyplot(PM_US.Post.log ~ precipitation.log | cut(Iprec.log, 6))
# no variables seem to interact with log cumulated precipitation
# seems to be interaction between combined wind direction and log cumulated wind speed, make a var for that
# summary: after eye-balling for possible interaction effects amongst almost all of the variables, there doesn't seem to be any interaction effects. No need to make interaction variables.
# next step: decide on final regression and interpret coefficients